In [1]:
%matplotlib inline
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import datetime
from scipy import stats

import warnings
warnings.filterwarnings('ignore')

In [2]:
import tabulate

Load and combine the data


In [3]:
SPLITS = ['train', 'test', 'dev']

In [4]:
df = {}
for split in SPLITS:
    df[split] = pd.read_csv('../../../../data/annotations/split/' + split + '/annotations.tsv',
                            sep='\t', encoding='utf-8')

In [5]:
combined_df = pd.concat(df.values())
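
As a quick sanity check (a sketch added here, not an executed cell of the original notebook), the per-split row counts should add up to the size of the combined frame:

In [ ]:
for split, frame in df.items():
    print(split, len(frame))
print('combined', len(combined_df))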

Aggregate the data


In [6]:
SOURCES = ['article_blocked', 'article_random', 'user_blocked', 'user_random']

In [7]:
agg_dict = {'ns': 'first', 'sample': 'first', 'src': 'first', 'recipient': 'mean', 
            'attack': 'mean', 'aggression': 'mean'}
grouped_df = combined_df.groupby('rev_id').agg(agg_dict)
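
The 'mean' aggregations above average over however many workers annotated each revision; the per-revision annotation counts can be inspected directly (a sketch, not an executed cell):

In [ ]:
annotations_per_rev = combined_df.groupby('rev_id').size()
print(annotations_per_rev.describe())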

In [8]:
grouped_source_df = {}
for source in SOURCES:
    grouped_source_df[source] = grouped_df[grouped_df['src'].str.contains(source)]
grouped_source_df['total'] = grouped_df

Compute the values of interest


In [9]:
number_of_revisions = {k: len(v) for k, v in grouped_source_df.items()}

In [10]:
number_of_revisions


Out[10]:
{'article_blocked': 31421,
 'article_random': 19571,
 'total': 115737,
 'user_blocked': 46705,
 'user_random': 18040}

In [11]:
# Next compute proportion of aggressive and attacking revisions for each source
num = {}
perc = {}
for term in ['attack', 'aggression']:
    num[term] = {}
    perc[term] = {}
    for source in SOURCES + ['total']:
        num[term][source] = len(grouped_source_df[source].query('%s > 0.5' % term))
        perc[term][source] = num[term][source]/number_of_revisions[source]

In [12]:
for term in ['attack', 'aggression']:
    print(term)
    print(num[term])
    print(perc[term])


attack
{'user_random': 190, 'article_random': 150, 'user_blocked': 11147, 'total': 13575, 'article_blocked': 2088}
{'user_random': 0.010532150776053215, 'article_random': 0.00766440141024986, 'user_blocked': 0.23866823680548122, 'total': 0.11729179087068094, 'article_blocked': 0.06645237261703955}
aggression
{'user_random': 247, 'article_random': 207, 'user_blocked': 11849, 'total': 14760, 'article_blocked': 2457}
{'user_random': 0.013691796008869179, 'article_random': 0.010576873946144805, 'user_blocked': 0.2536987474574457, 'total': 0.12753052178646415, 'article_blocked': 0.07819611088125776}
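
The same counts and proportions read more easily as a table; tabulate (imported above but otherwise unused here) can render one (a sketch, not an executed cell):

In [ ]:
rows = [[source,
         number_of_revisions[source],
         num['attack'][source], '%.1f%%' % (100 * perc['attack'][source]),
         num['aggression'][source], '%.1f%%' % (100 * perc['aggression'][source])]
        for source in SOURCES + ['total']]
print(tabulate.tabulate(rows, headers=['source', 'revisions', 'attacks', '% attack',
                                       'aggressive', '% aggressive']))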

Inter-annotator agreement


In [13]:
dat = combined_df  # alias, not a copy: the label columns added below also land on combined_df

In [14]:
dat.columns


Out[14]:
Index(['rev_id', '_worker_id', 'ns', 'sample', 'src', 'clean_diff', 'diff',
       'insert_only', 'page_id', 'page_title', 'rev_comment', 'rev_timestamp',
       'user_id', 'user_text', 'not_attack', 'other', 'quoting', 'recipient',
       'third_party', 'attack', 'aggression'],
      dtype='object')

In [15]:
ATTACK_COLUMNS = ['attack_bool', 'not_attack']
# Binarize each individual annotation at 0.5; the complement columns let the
# per-revision sums below count how many annotators chose each label.
# (Note: 'not_attack' overwrites the existing column of that name.)
dat['attack_bool'] = (dat['attack'] > 0.5).astype(int)
dat['not_attack'] = 1 - dat['attack_bool']
AGGRESSIVE_COLUMNS = ['aggressive_bool', 'not_aggressive']
dat['aggressive_bool'] = (dat['aggression'] > 0.5).astype(int)
dat['not_aggressive'] = 1 - dat['aggressive_bool']

In [16]:
agg_dict = {'ns': 'first', 'sample': 'first', 'src': 'first', 'recipient': 'mean', 
            'attack': 'mean', 'aggression': 'mean'}
agg_dict.update(dict.fromkeys(ATTACK_COLUMNS, 'sum'))
agg_dict.update(dict.fromkeys(AGGRESSIVE_COLUMNS, 'sum'))
ia_df = dat.groupby('rev_id').agg(agg_dict)
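
Because the agreement computation below treats these per-revision label counts as the rater judgements for each unit, a useful check is that they sum to the number of annotations per revision (a sketch, not an executed cell):

In [ ]:
print((ia_df[ATTACK_COLUMNS].sum(axis=1) == dat.groupby('rev_id').size()).all())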

In [17]:
%load_ext autoreload
%autoreload 2
from krippendorf_alpha import *

In [18]:
print('Attack: ')
print(Krippendorf_alpha(ia_df, ATTACK_COLUMNS))
print('Aggression: ')
print(Krippendorf_alpha(ia_df, AGGRESSIVE_COLUMNS))


Attack: 
0.451278401328
Aggression: 
0.438842898582
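
The krippendorf_alpha module is not included in this notebook, so for reference here is a minimal sketch of nominal Krippendorff's alpha computed directly from the per-unit label counts in ia_df. It is an assumed reimplementation, not the project's Krippendorf_alpha, and may differ from it in detail:

In [ ]:
def krippendorff_alpha_nominal(df, count_columns):
    # n_uc: number of annotators assigning each label c to unit (revision) u
    counts = df[count_columns].values.astype(float)
    n_u = counts.sum(axis=1)
    counts, n_u = counts[n_u > 1], n_u[n_u > 1]  # alpha is defined for units with >= 2 ratings
    n = n_u.sum()
    # observed disagreement: within-unit annotator pairs that chose different labels
    D_o = ((n_u ** 2 - (counts ** 2).sum(axis=1)) / (n_u - 1)).sum() / n
    # expected disagreement from the pooled label frequencies
    n_c = counts.sum(axis=0)
    D_e = (n ** 2 - (n_c ** 2).sum()) / (n * (n - 1))
    return 1 - D_o / D_e

print(krippendorff_alpha_nominal(ia_df, ATTACK_COLUMNS))
print(krippendorff_alpha_nominal(ia_df, AGGRESSIVE_COLUMNS))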

In [ ]: